# =============================================================================
# =============================================================================
# # Code to create cohorts for analyzing Clarity data
# # This is slightly complicated because the clarity data can be studied from multiple perspectives (inquiry vs transaction and panel versus event)
# # Each perspective requires a dataset that has been preparred accordingly.
# #   1. Import the tcp treated control panel and select the cohorts of interest.
# #   2. Create the base cohort dataset cc which has cohorts and the targetted individuals in treated and control areas
# #   3. Subset cc for size reasons and add in controls and descriptive variables eventually creating ccss
# #   4. Create cohort inquiry full panel cifp
# #   5. Create cohort inquiry event activity cifea
# #   6. Create cohort transaction full panel ctfp
# #   7. Create cohort transaction event ctfe
# #   8. Descriptive statistics tables
# =============================================================================
# =============================================================================


# =============================================================================
# Import the tcp panel (for zip codes)
# =============================================================================
geo = ts['geog']
freq = ts['freq']
afolder = ''

tcp = pd.read_parquet(cdd['p_d_geo_tcp'] + r'\tcp_25k_zip_'+freq+'.parquet')
cloc = pd.read_parquet(cdd['p_d_cl_c'] + r'\cloc_'+freq+'.parquet')


# =============================================================================
# Subset the tcp panel to the desired radius around treated zones.
tcps = tcp[(tcp.distance<10000)|((tcp.treated==1)&(tcp.distance<10000))|((tcp.ctype==0)&(tcp.distance_t2cf<25000))].copy()                   #zip codes are within 10km of a plasma center opening (present or counterfactual)
tcps = tcps[tcps['open'] >= dt.datetime(2014,4,1)]
tcps = tcps[tcps.ctype!=0]

tcps=tcps[(tcps.groupby('cohort')['treated'].transform('max')!=tcps.groupby('cohort')['treated'].transform('min'))]


# =============================================================================
#Create the cohorts dataset.
def cohort_maker(df_tcps, df_cloc, target):
    #Get the open date, the first and last quarters studied, and the list of zipcodes of interest
    openc = df_tcps.open_qpre.unique()[0]
    geoc = list(df_tcps[geo].unique())
    
    #Select any individual who has appeared in the designated geography in the quarter of interest (prior to the opening)
    temp = df_cloc[df_cloc['id'].isin(df_cloc[(df_cloc[geo].isin(geoc))&(df_cloc['dateq']==openc)]['id'].unique())].copy()
    #Reset the geography to match the geo in period open_qpre
    # temp[geo+'_act'] = temp[geo]
    temp[geo] = np.where(temp['dateq']==openc,temp[geo],np.nan)
    temp[geo] = temp.groupby('id')[geo].transform('max')
    #Merge with the event time.
    temp = pd.merge(temp.rename(columns={'dateq':'date'}),df_tcps[[geo,'date','cohort','treated','etime']],how='left',on=[geo,'date'])
    #Subset the dataset to reflect the zip-code specific pre and post periods.
    temp = temp[~pd.isnull(temp.etime)]
    return temp

def cohort_helper(df_tcps, df_cloc, target):
    # df_tcps=pd.DataFrame(df_tcps[0],columns=df_tcps[1])
    # df_cloc=pd.DataFrame(df_cloc[0],columns=df_cloc[1])
    for t in target:
        print(t)
        cc = cohort_maker(df_tcps=df_tcps[df_tcps.cohort==t], df_cloc=df_cloc[df_cloc['id'].isin(df_cloc[df_cloc[geo].isin(df_tcps[df_tcps.cohort==t][geo].unique())]['id'].unique())], target=t)
        cc = downsize(cc,types={'id':'integer','zip':'integer','cohort':'integer','treated':'integer','etime':'integer'})
        list(cc)
        cc.to_parquet(cdd['p_d_cl_pa'] + afolder + r'\cohorts\cohort_'+str(int(t)) + '_' +freq+'.parquet')
    return ['success']

files = os.listdir(cdd['p_d_cl_pa'] + afolder + r'\cohorts')
cohortl = ldiff(list(tcps.cohort.unique()),[int(re.findall('[0-9]+',v)[0]) for v in files])
cohortlg = split(cohortl,10)


tcpvars = [geo,'date','cohort','etime','treated','open_qpre']
clocvars = ['id', 'dateq', 'zip', 'state']
target=cohortlg[1]
df_cloc = cloc[cloc['id'].isin(cloc[cloc[geo].isin(tcps[tcps.cohort.isin(target)][geo].unique())]['id'].unique())][clocvars].copy()
df_tcps = tcps[tcps.cohort.isin(target)][tcpvars].copy()
temp = Parallel(n_jobs=5)(delayed(cohort_helper)(df_tcps = tcps[tcps.cohort.isin(target)][tcpvars].copy(),
                                                  df_cloc = cloc[cloc['id'].isin(cloc[cloc[geo].isin(tcps[tcps.cohort.isin(target)][geo].unique())]['id'].unique())][clocvars].copy(),
                                                  target=target) for target in tqdm(cohortlg))

#Read in and concatenate the cohorts
files = [f for f in os.listdir(cdd['p_d_cl_pa'] + afolder + r'\cohorts') if any(re.findall('_'+freq.upper(),f))]
cc = Parallel(n_jobs=10)(delayed(pd.read_parquet)(path=cdd['p_d_cl_pa'] + afolder +  r'\cohorts\\'+f) for f in tqdm(files))
cc = pd.concat(cc, ignore_index=True,sort=False)
cc.info(memory_usage='deep')
cc.to_parquet(cdd['p_d_cl_pa'] + afolder + r'\cc_'+freq+'.parquet')


# =============================================================================
#Create an extract of variables from cif and ctf and cloc and merge with cohorts
#Merge with the clarity cohorts
cc = pd.read_parquet(cdd['p_d_cl_pa'] + afolder + r'\cc_'+freq+'.parquet')
cc = cc[(cc.date>=dt.datetime(2014,1,1))&(cc.date<dt.datetime(2021,1,1))]


#Cohort description variables to use for subsetting cc further (as needed)
tcpss = tcps[(tcps.distance_t2cf_rank<=tcps.groupby(['cohort'])['distance_t2cf_rank'].transform(lambda x: x.drop_duplicates().nsmallest(21).iloc[-1]))].copy()
tcpss = tcpss[(tcpss.open > dt.datetime(2014,3,31))&(tcpss.open < dt.datetime(2021,1,1))]
tcpss = tcpss[tcpss.groupby('cohort')['treated'].transform('sum')>0]
tcpss.info(memory_usage='deep')


#Update the ranking variables to facilitate subsetting.
import random as rn
for v in ['distance_t2cf_rank']:
    tcpss.sort_values(by=['cohort',v],inplace=True)
    tcpss[v] = 1 - tcpss.duplicated(subset=['cohort',v])
    tcpss[v] = tcpss.groupby('cohort')[v].cumsum()-1
#Merge the cc wit tcpss (takes about 10min)
tcpvars = ['cohort', 'zip', 'date', 'open', 'distance', 'intensity_absdelta_tcf', 'open_cf', 'cohort_cf', 
              'distance_t2cf', 'ctype', 'etime', 'treated', 'distance_t2cf_rank']
ccvars = ['id','date','zip','cohort']
ccs = pd.merge(cc[ccvars],tcpss[ldiff(tcpvars,list(ccvars)) + ['date','zip','cohort']],how='inner',on=['cohort','zip','date'])



del cc, tcp, tcps, tcpss
gc.collect()
ccs.to_parquet(cdd['p_d_cl_pa'] + afolder + r'\ccs_'+freq+'.parquet')
ccs.info(memory_usage='deep')

#Keep only the high treatment intensity observations
ccs = pd.read_parquet(cdd['p_d_cl_pa'] + afolder + r'\ccs_'+freq+'.parquet')
ccss = ccs[(ccs.distance<5000)&(ccs.intensity_absdelta_tcf>5000)].copy()
del ccs
gc.collect()

#Import the controls (also copy the 2019 controls to use in 2020 and 2021)
zip_controls_aug = pd.read_parquet(cdd['p_d_acs_zip'] + r'\acs_2014_2022_s3.parquet')
#Restrict the zipcode sample to zipcode that are actually in my data.
zip_controls_aug = zip_controls_aug[zip_controls_aug.zip.isin(ccss['zip'].unique())]


#Merge in the population density
dens_vars = ['zip', 'year', 'pop2sqkm_wq10', 'state']
ccss['open_ypre'] = ccss['open'].dt.year - 1
ccss = pd.merge(ccss, zip_controls_aug[dens_vars].rename(columns={'year':'open_ypre'}), how='left', on=['zip','open_ypre'])
ccss.drop(columns=['open_ypre'],inplace=True)


#Import the inquiry, transaction, and location data
ci_desc = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci_desc_' + freq + '.parquet')
ci_desc=ci_desc[ci_desc.id.isin(ccss.id.unique())]


dmap = ccss[ccss.etime==-1][['open','date']].drop_duplicates()
dmap = dict(zip(dmap.open.tolist(),dmap.date.tolist()))
ccss['open_qpre'] = ccss['open'].map(dmap)
ccss = pd.merge(ccss, ci_desc.rename(columns={'date'+freq:'open_qpre'}), how='left',on=['id','open_qpre'])
ccss.drop(columns=['open_qpre'],inplace=True)
ccss.info(memory_usage='deep')

ccss['age'] = (ccss['date'] - ccss['dob']).dt.total_seconds() / (3600*24*365)
ccss['age_2g'] = np.where(~pd.isnull(ccss.age),1*(ccss.age>35),np.nan)
ccss['age_3g'] = np.where(~pd.isnull(ccss.age),1*(ccss.age>30) + 1*(ccss.age>45),np.nan)
ccss['reside'] = (ccss['date'] - ccss['dor']).dt.total_seconds() / (3600*24*365)
ccss['reside_2g'] = np.where(~pd.isnull(ccss.reside),1*(ccss.reside>36),np.nan)
ccss['reside_3g'] = np.where(~pd.isnull(ccss.reside),1*(ccss.reside>24) + 1*(ccss.reside>48),np.nan)
ccss['income_2g'] = np.where(~pd.isnull(ccss.income),1*(ccss.income>2500),np.nan)
ccss['income_3g'] = np.where(~pd.isnull(ccss.income),1*(ccss.income>2000) + 1*(ccss.income>3000),np.nan)


#Prep an Event Time Variable for the more 
if freq=='q':
    ccss['etimey'] = np.floor(ccss['etime'] / 4)
if freq=='m':
    ccss['etimey'] = np.floor(ccss['etime'] / 12)
    ccss['etimeq'] = np.floor(ccss['etime'] / 3)
ccss.rename(columns={'intensity_absdelta_tcf':'ic'},inplace=True)


ccss.to_parquet(cdd['p_d_cl_pa'] + afolder + r'\ccss_'+freq+'.parquet')
ccss = pd.read_parquet(cdd['p_d_cl_pa'] + afolder + r'\ccss_'+freq+'.parquet')
ccss.info(memory_usage='deep')
del ci_desc, cloc, files, temp
gc.collect()




# =============================================================================
# Build dataset for inquiries (panel)
# =============================================================================
#Import the inquiry, transaction, and location data
itypes = ['C1','C2','C3','C5','C6','C13']
hor1 = [7,15,30,90,180,360]
hor2 = [3,6,9,12,15,18,21,24,27,30,45,60,90,180,270,360,540,730] #Full
hork = ['_',12,30,90,180]
cif=pd.read_parquet(cdd['p_d_cl_c'] + r'\ci_agg_'+freq+'_s.parquet')
kvars = [v for v in list(cif) if (any(re.findall(r'('+'|'.join(itypes) + r')$',v)))&((~any(re.findall(r'_na',v)))|(any(re.findall(r'(na|nac)('+'|'.join([str(v) for v in hork]) + r')',v))))]
cif = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci_agg_'+freq+'.parquet').rename(columns={'dateq':'date'})[['id','date'] + kvars]
cif.info(memory_usage='deep')

#For inquiry data require at least 1000 records of that type of inquiry.
#   I don't want to inflate the sense that the pre-period was identically zero
inq_i = [v for v in list(cif) if any(re.findall(r'('+'|'.join(itypes) + r')$',v))]
for v in tqdm(inq_i):
    cif[v] = np.where(cif.groupby(['date'])[v].transform('sum')>=1000, cif[v], np.nan)
cif=cif[cif.id.isin(ccss.id.unique())].copy()


#Merge in the inquiry activity
ccf = pd.merge(ccss,cif,how='left',on=['id','date'])
for v in tqdm(inq_i):
    ccf[v] = ccf[v].fillna(0)
    ccf[v] = np.where(ccf.groupby(['date'])[v].transform('max')>0, ccf[v], np.nan)
ccf.info(memory_usage='deep')
ccf.to_parquet(cdd['p_d_cl_pa'] + afolder + r'\cifp_'+freq+'.parquet')


del cif, ccf
gc.collect()



# =============================================================================
# Build dataset for inquiries (events)
# =============================================================================
ci_s = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci_s.parquet')
ci_ss = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci_ss.parquet')
ci_s = ci_s[ci_s.id.isin(ccss.id.unique())]
ci_s = ci_s[ci_s.typea.isin(itypes)]
ci_ss = ci_ss[ci_ss.id.isin(ccss.id.unique())]


cifea = pd.merge(ci_s, ccss.rename(columns={'date':'dateq'}),how='left',on=['id','dateq'])
cife = pd.merge(ci_ss, ccss.rename(columns={'date':'dateq'}),how='left',on=['id','dateq'])
cifea.info(memory_usage='deep')
cife.info(memory_usage='deep')


cifea.to_parquet(cdd['p_d_cl_pa'] + afolder + r'\cifea_'+freq+'.parquet')
cife.to_parquet(cdd['p_d_cl_pa'] + afolder + r'\cife_'+freq+'.parquet')


del ci_s, ci_ss, cifea, cife
gc.collect()



# =============================================================================
# Build dataset for transactions (panel)
# =============================================================================
ttypes = ['C1','C2','C3','C5','C6','01']
# ctf = pd.read_parquet(cdd['p_d_cl_c'] + afolder + r'\ctf_'+freq+'.parquet')
files = [file for file in os.listdir(cdd['p_d_cl_c'] + r'\ctf_parts') if any(re.findall(r'_' + freq,file,flags=re.IGNORECASE))]
ctf=pd.read_parquet(cdd['p_d_cl_c'] + r'\ctf_parts\\'+files[0])
kvars = [v for v in list(ctf) if any(re.findall(r'('+'|'.join(ttypes) + r')$',v))] + \
             [ 'ntrans', 'ntransa', 'ntrans_d', 'ch_d', 'freq_d', 'ntrans_dcmean', 'ch_dcmean',
              'ntrans_dmean', 'ch_dmean', 'freq_dcmean', 'anytrans_dmean']
ctf = pd.concat(Parallel(n_jobs=5)(delayed(pd.read_parquet)(cdd['p_d_cl_c'] + r'\ctf_parts\\'+f) for f in tqdm(files)), ignore_index=True,sort=False)[['id','date']+kvars]
ctf = ctf[ctf.id.isin(ccss.id.unique())].copy()

#Merge in the inquiry activity
ccf = pd.merge(ccss[ccss.id.isin(ctf.id.unique())],ctf,how='left',on=['id','date'])
ccf[kvars] = ccf[kvars].fillna(0)

#Export the data.
ccf.to_parquet(cdd['p_d_cl_pa'] + afolder + r'\ctfp_'+freq+'.parquet')
ccf.info(memory_usage='deep')

del ctf, ccf
gc.collect()



# =============================================================================
# Build dataset for transactions (events)
# =============================================================================
ttypes = ['C1','C2','C3','C5','C6','01']
ct = pd.read_parquet(cdd['p_d_cl_c'] + r'\ct.parquet')
ct = ct[(ct.id.isin(ccss.id.unique()))&(ct.typea.isin(ttypes))].copy().rename(columns={'openq':'dateq'})
ctf = pd.read_parquet(cdd['p_d_cl_c'] + r'\ctf_'+freq+'.parquet')
kvars = [ 'ntrans', 'ntransa', 'ntrans_d', 'ch_d', 'freq_d', 'ntrans_dcmean', 'ch_dcmean', 'ntrans_dmean', 'ch_dmean', 'freq_dcmean', 'anytrans_dmean']
ctf = ctf[ctf.id.isin(ccss.id.unique())][['id','date']+kvars].copy()


list(ct)
ccsst = ccss[ccss.id.isin(ct.id.unique())].drop(columns=['zip']).rename(columns={'date':'dateq','open':'open_coh','zip':'zip_coh'})
ccf = pd.merge(ct, ccsst,how='left',on=['id','dateq'])
dates_pre = {v:v - relativedelta(months=3) + relativedelta(day=31) for v in list(ct['dateq'].drop_duplicates())}
ccf['dateq_pre'] = ccf['dateq'].map(dates_pre)
ccf = pd.merge(ccf, ctf.rename(columns={'date':'dateq_pre'}), how='left',on=['id','dateq_pre'])

ccf.to_parquet(cdd['p_d_cl_pa'] + afolder + r'\ctfe_'+freq+'.parquet')
ccf.info(memory_usage='deep')


del ct, ctf, ccf, ccsst
gc.collect()




# =============================================================================
# Create Summary Statistics Tables for Clarity Data.
# =============================================================================
ccf = pd.read_parquet(cdd['p_d_cl_pa'] + r'\cifp_Q.parquet')#[['cohort','age','income','distance','ic','treated','open','inq_C1','inq_C2','inq_C3','inq_C6']]
ccf['inq_payday'] = ccf['inq_C1'] + ccf['inq_C2']
ccf['inq_inst'] = ccf['inq_C3'] + ccf['inq_C6']
ccf=ccf[(ccf.groupby('cohort')['treated'].transform('max')==1)&(ccf.open<dt.datetime(2020,1,1))&(ccf.open>=dt.datetime(2014,7,1))&(ccf.distance<5000)&(ccf.ic>5000)]
ccf['inq_payday_any'] = 1*(ccf['inq_payday']>0)
ccf['inq_inst_any'] = 1*(ccf['inq_inst']>0)
ccf['inq_payday_gt1'] = 1*(ccf['inq_payday']>1)
ccf['inq_inst_gt1'] = 1*(ccf['inq_inst']>1)


#Summary statistics for households within 10 kilometers of the opening or counterfactual plasma center.
ssrn = {'mean':'Mean','std':'Std. Dev.','count':'N','min':'Min','max':'Max','10%':'10 Pct','25%':'25 Pct','50%':'50 Pct','75%':'75 Pct','90%':'90 Pct','index':'Variables'}
ssvars_lab = {'inq_payday':'Inq Payday Num.','inq_inst':'Inq Inst. Num.','inq_payday_any':'Inq. Payday gt0','inq_inst_any':'Inq. Inst. gt0','inq_payday_gt1':'Inq. Payday gt1','inq_inst_gt1':'Inq. Inst. gt1','age':'Age','income':'Income'}
ss_cl_all = ccf[ssvars_lab.keys()].describe(percentiles=[0.1,0.25,0.5,0.75,0.9])
ss_cl_d50_ic50 = ccf[ssvars_lab.keys()].rename(columns=ssvars_lab).describe(percentiles=[0.1,0.25,0.5,0.75,0.9])
ss_cl_d50_ic50_ltx = ss_cl_d50_ic50.T.rename(columns=ssrn).reset_index().to_latex(index=False,index_names=False,float_format="%.3f").replace('.000','')
with open(cdd['p_rt'] + r'\Clarity\Custom_SummaryStatistics.tex', "w") as text_file:
    text_file.write(ss_cl_d50_ic50_ltx)






